AAQoL machine learning analysis with unbalanced random forest

Author

Miguel Fudolig

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)
library(ggRandomForests)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above")))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Demographics

ps(Ethnicity)
# A tibble: 7 × 3
  Ethnicity        n     pct
  <fct>        <int>   <dbl>
1 Chinese        639 24.5   
2 Asian Indian   574 22.0   
3 Filipino       265 10.2   
4 Korean         471 18.1   
5 Other          144  5.52  
6 Vietnamese     514 19.7   
7 <NA>             2  0.0767
ps(Gender)
# A tibble: 3 × 3
  Gender     n   pct
  <fct>  <int> <dbl>
1 Female  1425 54.6 
2 Male    1157 44.3 
3 <NA>      27  1.03
ps(Religion)
# A tibble: 8 × 3
  Religion       n    pct
  <fct>      <int>  <dbl>
1 Buddhist     350 13.4  
2 Catholic     492 18.9  
3 Hindu        479 18.4  
4 Muslim        68  2.61 
5 None         506 19.4  
6 Other         47  1.80 
7 Protestant   645 24.7  
8 <NA>          22  0.843
ps(`Full Time Employment`)
# A tibble: 3 × 3
  `Full Time Employment`     n    pct
  <fct>                  <int>  <dbl>
1 0                       1458 55.9  
2 Employed full time      1144 43.8  
3 <NA>                       7  0.268
ps(Income)
# A tibble: 9 × 3
  Income                n   pct
  <fct>             <int> <dbl>
1 $0 - $9,999         254  9.74
2 $10,000 - $19,999   205  7.86
3 $20,000 - $29,999   198  7.59
4 $30,000 - $39,999   207  7.93
5 $40,000 - $49,999   181  6.94
6 $50,000 - $59,999   178  6.82
7 $60,000 - $69,999   190  7.28
8 $70,000 and over    993 38.1 
9 <NA>                203  7.78
ps(`English Speaking`)
# A tibble: 5 × 3
  `English Speaking`     n    pct
  <fct>              <int>  <dbl>
1 Not at all           177  6.78 
2 Not well             632 24.2  
3 Very well            974 37.3  
4 Well                 808 31.0  
5 <NA>                  18  0.690
ps(`English Difficulties`)
# A tibble: 5 × 3
  `English Difficulties`     n   pct
  <fct>                  <int> <dbl>
1 Not at all               772 29.6 
2 Much                     549 21.0 
3 Not much                 733 28.1 
4 Very much                516 19.8 
5 <NA>                      39  1.49
ps(Discrimination)
# A tibble: 3 × 3
  Discrimination     n   pct
           <dbl> <int> <dbl>
1              0  1598  61.2
2              1   694  26.6
3             NA   317  12.2
qol |> summarize(age_mean = mean(Age,na.rm=T),
                 age_sd = sd(Age,na.rm=T),
                 age_min = min(Age,na.rm=T),
                 age_max = max(Age,na.rm=T))
# A tibble: 1 × 4
  age_mean age_sd age_min age_max
     <dbl>  <dbl>   <dbl>   <dbl>
1     42.9   17.1      18      98

Source of Information: Family

ps(Family)
# A tibble: 4 × 3
  Family     n     pct
  <fct>  <int>   <dbl>
1 3          1  0.0383
2 No      1258 48.2   
3 Yes     1331 51.0   
4 <NA>      19  0.728 
rfdata <- qol |> filter(Family %in% c("No","Yes")) |> 
  mutate(Family=droplevels(Family)) |> 
  select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  # filter(!is.na(Family)) |> 
  na.omit() |>
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc",method="brf")
print(rfobj)
                         Sample size: 1926
           Frequency of class labels: 928, 998
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 528.373
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swr
    Resample size used to grow trees: 1856
                            Analysis: RF-C
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0754
                   (OOB) Brier score: 0.23115124
        (OOB) Normalized Brier score: 0.92460495
                           (OOB) AUC: 0.65014728
                      (OOB) Log-loss: 0.65340355
                        (OOB) PR-AUC: 0.61312168
                        (OOB) G-mean: 0.59612122
   (OOB) Requested performance error: 0.40387878

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  593 335      0.3610
       Yes 443 555      0.4439

      (OOB) Misclassification rate: 0.403946
print(rfobj)
                         Sample size: 1926
           Frequency of class labels: 928, 998
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 528.373
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swr
    Resample size used to grow trees: 1856
                            Analysis: RF-C
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0754
                   (OOB) Brier score: 0.23115124
        (OOB) Normalized Brier score: 0.92460495
                           (OOB) AUC: 0.65014728
                      (OOB) Log-loss: 0.65340355
                        (OOB) PR-AUC: 0.61312168
                        (OOB) G-mean: 0.59612122
   (OOB) Requested performance error: 0.40387878

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  593 335      0.3610
       Yes 443 555      0.4439

      (OOB) Misclassification rate: 0.403946
plot(rfobj,plots.one.page = FALSE)


                              all   No   Yes
Age                        0.0126   NA    NA
Ethnicity                  0.0070   NA    NA
EnglishDiff                0.0048   NA    NA
Family.Respect             0.0045   NA    NA
Similar.Values             0.0019   NA    NA
Dental.Insurance           0.0015   NA    NA
Loyalty                    0.0007   NA    NA
Gender                    -0.0015   NA    NA
Spend.Time.Together       -0.0017   NA    NA
Successful.Family         -0.0017   NA    NA
EnglishSpeak              -0.0021   NA    NA
Togetherness              -0.0022   NA    NA
Feel.Close                -0.0024   NA    NA
Discrimination            -0.0031   NA    NA
Family.Pride              -0.0031   NA    NA
Get.Along                 -0.0037   NA    NA
Close.knit.Community      -0.0043   NA    NA
Helpful.Family            -0.0045   NA    NA
See.Friends               -0.0046   NA    NA
Health.Insurance          -0.0052   NA    NA
Trust                     -0.0058   NA    NA
Income_median             -0.0058   NA    NA
Community.Shares.Values   -0.0062   NA    NA
Religious.Attendance      -0.0065   NA    NA
Employment                -0.0065   NA    NA
Helpful.Community         -0.0067   NA    NA
rfobj$importance
                                  all No Yes
Ethnicity                0.0069551564 NA  NA
Age                      0.0125900668 NA  NA
Gender                  -0.0014680505 NA  NA
Religion                -0.0084388205 NA  NA
Employment              -0.0064719112 NA  NA
Income_median           -0.0058265061 NA  NA
EnglishSpeak            -0.0021443278 NA  NA
EnglishDiff              0.0048111707 NA  NA
See.Family              -0.0100360757 NA  NA
Close.Family            -0.0075970273 NA  NA
Helpful.Family          -0.0044823810 NA  NA
See.Friends             -0.0045875412 NA  NA
Close.Friends           -0.0121965006 NA  NA
Helpful.Friends         -0.0142166122 NA  NA
Family.Respect           0.0044917030 NA  NA
Similar.Values           0.0019048933 NA  NA
Successful.Family       -0.0016974746 NA  NA
Trust                   -0.0057673093 NA  NA
Loyalty                  0.0007367428 NA  NA
Family.Pride            -0.0031190247 NA  NA
Expression              -0.0080815145 NA  NA
Spend.Time.Together     -0.0016685756 NA  NA
Feel.Close              -0.0024267114 NA  NA
Togetherness            -0.0021741062 NA  NA
Religious.Attendance    -0.0064522008 NA  NA
Religious.Importance    -0.0089717074 NA  NA
Close.knit.Community    -0.0043313496 NA  NA
Helpful.Community       -0.0066671893 NA  NA
Community.Shares.Values -0.0062380318 NA  NA
Get.Along               -0.0037187432 NA  NA
Community.Trust         -0.0093222463 NA  NA
Health.Insurance        -0.0052315829 NA  NA
Dental.Insurance         0.0014789379 NA  NA
Discrimination          -0.0030703725 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Cross validation in Random Forests (Run when you have time)

# myTrainingControl <- trainControl(method = "repeatedcv",
#                                   number = 10,                   
#                                   repeats = 3,                 
#                                   savePredictions = TRUE,
#                                   classProbs = TRUE,
#                                   verboseIter = TRUE,
#                                   search = "grid")
# 
# 
# set.seed(123)
# 
# model_rf <- train(Family~ .,
#                   data=rfdata,
#                   method = 'rf',
#                   metric = "Accuracy",             
#                   trControl = myTrainingControl,       
#                   importance = TRUE                 
#                   )
# 
# varImp(model_rf)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- caret::createDataPartition(rfdata$Family,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Family~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]
# 
# rfsrc(Family~.,data=train, importance="permute", 
#       perf.type="gmean",
#       splitrule="auc",
#       block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="auc")

print(rfobj)
                         Sample size: 1542
           Frequency of class labels: 756, 786
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 335.6223
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 975
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0397
                   (OOB) Brier score: 0.18671405
        (OOB) Normalized Brier score: 0.74685618
                           (OOB) AUC: 0.85810631
                      (OOB) Log-loss: 0.5603946
                        (OOB) PR-AUC: 0.85597293
                        (OOB) G-mean: 0.77922288
   (OOB) Requested performance error: 0.22077712

Confusion matrix:

          predicted
  observed Yes  No class.error
       Yes 581 175      0.2315
       No  165 621      0.2099

      (OOB) Misclassification rate: 0.2204929
plot(rfobj,plots.one.page = FALSE)


                          all   Yes   No
Ethnicity              0.0304    NA   NA
Discrimination         0.0243    NA   NA
Religion               0.0228    NA   NA
EnglishDiff            0.0207    NA   NA
Close.Family           0.0202    NA   NA
Religious.Attendance   0.0188    NA   NA
Age                    0.0185    NA   NA
Get.Along              0.0160    NA   NA
Spend.Time.Together    0.0155    NA   NA
Income_median          0.0155    NA   NA
See.Family             0.0154    NA   NA
Gender                 0.0138    NA   NA
Religious.Importance   0.0137    NA   NA
Employment             0.0135    NA   NA
See.Friends            0.0130    NA   NA
EnglishSpeak           0.0128    NA   NA
Successful.Family      0.0127    NA   NA
Similar.Values         0.0122    NA   NA
Close.Friends          0.0117    NA   NA
Togetherness           0.0113    NA   NA
Trust                  0.0104    NA   NA
Close.knit.Community   0.0095    NA   NA
Helpful.Community      0.0091    NA   NA
Feel.Close             0.0087    NA   NA
Expression             0.0078    NA   NA
Dental.Insurance       0.0077    NA   NA
rfobj$importance
                                  all Yes No
Ethnicity                3.043835e-02  NA NA
Age                      1.847720e-02  NA NA
Gender                   1.379629e-02  NA NA
Religion                 2.283324e-02  NA NA
Employment               1.352591e-02  NA NA
Income_median            1.545365e-02  NA NA
EnglishSpeak             1.275595e-02  NA NA
EnglishDiff              2.072475e-02  NA NA
See.Family               1.538315e-02  NA NA
Close.Family             2.018801e-02  NA NA
Helpful.Family           6.238437e-03  NA NA
See.Friends              1.302496e-02  NA NA
Close.Friends            1.168193e-02  NA NA
Helpful.Friends          6.299399e-03  NA NA
Family.Respect           5.191944e-03  NA NA
Similar.Values           1.219078e-02  NA NA
Successful.Family        1.272191e-02  NA NA
Trust                    1.042880e-02  NA NA
Loyalty                  3.143308e-03  NA NA
Family.Pride             3.182341e-03  NA NA
Expression               7.832653e-03  NA NA
Spend.Time.Together      1.553298e-02  NA NA
Feel.Close               8.655568e-03  NA NA
Togetherness             1.130709e-02  NA NA
Religious.Attendance     1.884340e-02  NA NA
Religious.Importance     1.374572e-02  NA NA
Close.knit.Community     9.496852e-03  NA NA
Helpful.Community        9.130726e-03  NA NA
Community.Shares.Values  6.267829e-03  NA NA
Get.Along                1.601573e-02  NA NA
Community.Trust          5.738946e-03  NA NA
Health.Insurance        -4.211306e-05  NA NA
Dental.Insurance         7.745393e-03  NA NA
Discrimination           2.429195e-02  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

ggsave(filename="family_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
199.0000000 185.0000000   1.0756757   0.4817708   0.5351351   0.5226131 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5103093   0.5473684   0.4713542   0.2451071   0.9804283   0.5738693 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.6832185   0.5224274   0.5284946   0.4817708   0.5537617   0.5256322 
 F1modgmean       gmean 
  0.5286658   0.5288370 
test_rf$importance
                                  all Yes No
Ethnicity                7.773687e-03  NA NA
Age                      1.391193e-02  NA NA
Gender                   1.481661e-03  NA NA
Religion                -6.770110e-05  NA NA
Employment               3.369657e-04  NA NA
Income_median            1.341765e-04  NA NA
EnglishSpeak             1.737331e-03  NA NA
EnglishDiff              1.686164e-04  NA NA
See.Family               3.991785e-04  NA NA
Close.Family            -3.209117e-03  NA NA
Helpful.Family           1.130363e-03  NA NA
See.Friends              1.588099e-03  NA NA
Close.Friends            1.389035e-03  NA NA
Helpful.Friends         -1.563947e-03  NA NA
Family.Respect          -7.517450e-04  NA NA
Similar.Values          -3.947004e-04  NA NA
Successful.Family        1.621815e-03  NA NA
Trust                   -9.002110e-04  NA NA
Loyalty                 -1.147870e-04  NA NA
Family.Pride             1.106295e-03  NA NA
Expression              -1.092909e-03  NA NA
Spend.Time.Together     -8.849227e-04  NA NA
Feel.Close              -3.067122e-05  NA NA
Togetherness             5.981813e-05  NA NA
Religious.Attendance     5.583681e-04  NA NA
Religious.Importance    -5.044237e-04  NA NA
Close.knit.Community    -4.833768e-04  NA NA
Helpful.Community       -8.145869e-04  NA NA
Community.Shares.Values -2.730269e-04  NA NA
Get.Along                1.300196e-03  NA NA
Community.Trust         -5.415171e-04  NA NA
Health.Insurance         1.750978e-04  NA NA
Dental.Insurance         1.569785e-03  NA NA
Discrimination           8.925712e-04  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

ggsave(filename="family_test_VIMP.png",width=5,height=5,units="in")

Source of Information: Health Professionals

ps(`Heal Professionals`)
# A tibble: 3 × 3
  `Heal Professionals`     n    pct
  <fct>                <int>  <dbl>
1 No                    1326 50.8  
2 Yes                   1264 48.4  
3 <NA>                    19  0.728
rfdata <- qol |> 
  select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imbalanced(Heal.Professionals ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")->rfobj

print(rfobj)
                         Sample size: 1927
           Frequency of class labels: 925, 1002
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 529.8617
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1218
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0832
                   (OOB) Brier score: 0.22739259
        (OOB) Normalized Brier score: 0.90957038
                           (OOB) AUC: 0.67375951
                      (OOB) Log-loss: 0.64656947
                        (OOB) PR-AUC: 0.63055041
                        (OOB) G-mean: 0.62051216
   (OOB) Requested performance error: 0.37948784

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  562 363      0.3924
       Yes 367 635      0.3663

      (OOB) Misclassification rate: 0.3788272
plot(rfobj,plots.one.page = FALSE)


                              all   No   Yes
EnglishSpeak               0.0087   NA    NA
Get.Along                  0.0077   NA    NA
Community.Shares.Values    0.0076   NA    NA
Spend.Time.Together        0.0068   NA    NA
Expression                 0.0065   NA    NA
Gender                     0.0064   NA    NA
Age                        0.0056   NA    NA
Similar.Values             0.0049   NA    NA
Health.Insurance           0.0049   NA    NA
Feel.Close                 0.0048   NA    NA
Community.Trust            0.0045   NA    NA
Family.Pride               0.0045   NA    NA
Income_median              0.0041   NA    NA
Discrimination             0.0039   NA    NA
Dental.Insurance           0.0034   NA    NA
Helpful.Community          0.0031   NA    NA
Family.Respect             0.0030   NA    NA
Trust                      0.0026   NA    NA
Close.knit.Community       0.0012   NA    NA
Loyalty                    0.0006   NA    NA
Religious.Importance      -0.0008   NA    NA
Togetherness              -0.0008   NA    NA
Ethnicity                 -0.0018   NA    NA
Religious.Attendance      -0.0021   NA    NA
Religion                  -0.0023   NA    NA
Helpful.Family            -0.0023   NA    NA
rfobj$importance
                                  all No Yes
Ethnicity               -0.0018143542 NA  NA
Age                      0.0056014941 NA  NA
Gender                   0.0063757609 NA  NA
Religion                -0.0022995991 NA  NA
Employment              -0.0024295113 NA  NA
Income_median            0.0041083863 NA  NA
EnglishSpeak             0.0086779884 NA  NA
EnglishDiff             -0.0028554330 NA  NA
See.Family              -0.0040666801 NA  NA
Close.Family            -0.0026329858 NA  NA
Helpful.Family          -0.0023203869 NA  NA
See.Friends             -0.0059329832 NA  NA
Close.Friends           -0.0071864210 NA  NA
Helpful.Friends         -0.0065896944 NA  NA
Family.Respect           0.0029979134 NA  NA
Similar.Values           0.0049175798 NA  NA
Successful.Family       -0.0029246611 NA  NA
Trust                    0.0026337288 NA  NA
Loyalty                  0.0005523029 NA  NA
Family.Pride             0.0045040927 NA  NA
Expression               0.0065145648 NA  NA
Spend.Time.Together      0.0067966620 NA  NA
Feel.Close               0.0048483539 NA  NA
Togetherness            -0.0008427275 NA  NA
Religious.Attendance    -0.0020812857 NA  NA
Religious.Importance    -0.0007880284 NA  NA
Close.knit.Community     0.0012392363 NA  NA
Helpful.Community        0.0030590686 NA  NA
Community.Shares.Values  0.0075557164 NA  NA
Get.Along                0.0077018301 NA  NA
Community.Trust          0.0045040927 NA  NA
Health.Insurance         0.0049053114 NA  NA
Dental.Insurance         0.0033728024 NA  NA
Discrimination           0.0039211271 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

pos<- rfdata |> filter(Heal.Professionals=="Yes")
neg <- rfdata |> filter(Heal.Professionals==0)

set.seed(222)
imbal_index <- createDataPartition(rfdata$Heal.Professionals,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Heal.Professionals~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Heal.Professionals ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="auc")
print(rfobj)
                         Sample size: 1542
           Frequency of class labels: 756, 786
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 323.5823
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 975
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0397
                   (OOB) Brier score: 0.17475468
        (OOB) Normalized Brier score: 0.6990187
                           (OOB) AUC: 0.86249361
                      (OOB) Log-loss: 0.53255883
                        (OOB) PR-AUC: 0.8529244
                        (OOB) G-mean: 0.78548997
   (OOB) Requested performance error: 0.21451003

Confusion matrix:

          predicted
  observed Yes  No class.error
       Yes 604 152      0.2011
       No  179 607      0.2277

      (OOB) Misclassification rate: 0.2146563
print(rfobj)
                         Sample size: 1542
           Frequency of class labels: 756, 786
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 323.5823
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 975
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0397
                   (OOB) Brier score: 0.17475468
        (OOB) Normalized Brier score: 0.6990187
                           (OOB) AUC: 0.86249361
                      (OOB) Log-loss: 0.53255883
                        (OOB) PR-AUC: 0.8529244
                        (OOB) G-mean: 0.78548997
   (OOB) Requested performance error: 0.21451003

Confusion matrix:

          predicted
  observed Yes  No class.error
       Yes 604 152      0.2011
       No  179 607      0.2277

      (OOB) Misclassification rate: 0.2146563
plot(rfobj,plots.one.page = FALSE)


                             all   Yes   No
EnglishSpeak              0.0376    NA   NA
Religion                  0.0267    NA   NA
EnglishDiff               0.0248    NA   NA
Religious.Importance      0.0247    NA   NA
Ethnicity                 0.0242    NA   NA
See.Friends               0.0188    NA   NA
Age                       0.0156    NA   NA
Discrimination            0.0143    NA   NA
Religious.Attendance      0.0143    NA   NA
Dental.Insurance          0.0143    NA   NA
Close.knit.Community      0.0130    NA   NA
Helpful.Community         0.0117    NA   NA
Get.Along                 0.0117    NA   NA
Helpful.Family            0.0111    NA   NA
See.Family                0.0110    NA   NA
Close.Family              0.0098    NA   NA
Community.Shares.Values   0.0091    NA   NA
Family.Respect            0.0091    NA   NA
Close.Friends             0.0078    NA   NA
Feel.Close                0.0071    NA   NA
Gender                    0.0071    NA   NA
Community.Trust           0.0065    NA   NA
Togetherness              0.0065    NA   NA
Income_median             0.0058    NA   NA
Employment                0.0058    NA   NA
Helpful.Friends           0.0052    NA   NA
rfobj$importance
                                all Yes No
Ethnicity               0.024183711  NA NA
Age                     0.015586870  NA NA
Gender                  0.007136857  NA NA
Religion                0.026725642  NA NA
Employment              0.005836350  NA NA
Income_median           0.005844984  NA NA
EnglishSpeak            0.037625231  NA NA
EnglishDiff             0.024767514  NA NA
See.Family              0.011034099  NA NA
Close.Family            0.009780208  NA NA
Helpful.Family          0.011077560  NA NA
See.Friends             0.018808992  NA NA
Close.Friends           0.007781435  NA NA
Helpful.Friends         0.005219260  NA NA
Family.Respect          0.009078697  NA NA
Similar.Values          0.004539086  NA NA
Successful.Family       0.003891799  NA NA
Trust                   0.004539086  NA NA
Loyalty                 0.005193379  NA NA
Family.Pride            0.004542318  NA NA
Expression              0.003889646  NA NA
Spend.Time.Together     0.005193379  NA NA
Feel.Close              0.007149830  NA NA
Togetherness            0.006484172  NA NA
Religious.Attendance    0.014274295  NA NA
Religious.Importance    0.024652487  NA NA
Close.knit.Community    0.012970486  NA NA
Helpful.Community       0.011679748  NA NA
Community.Shares.Values 0.009080865  NA NA
Get.Along               0.011673223  NA NA
Community.Trust         0.006484172  NA NA
Health.Insurance        0.001944557  NA NA
Dental.Insurance        0.014267749  NA NA
Discrimination          0.014274295  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

ggsave(filename="healthpro_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
200.0000000 185.0000000   1.0810811   0.4805195   0.6378378   0.6200000 
       prec         npv    misclass       brier  brier.norm         auc 
  0.6082474   0.6492147   0.3714286   0.2345217   0.9380869   0.6493243 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.6617049   0.6226913   0.6284279   0.4805195   0.6120594   0.6257735 
 F1modgmean       gmean 
  0.6286418   0.6288557 
test_rf$importance
                                  all Yes No
Ethnicity                1.491325e-03  NA NA
Age                      3.472339e-03  NA NA
Gender                   4.084452e-06  NA NA
Religion                 1.941187e-03  NA NA
Employment              -3.785221e-07  NA NA
Income_median            1.264392e-03  NA NA
EnglishSpeak             1.904263e-02  NA NA
EnglishDiff              1.101844e-02  NA NA
See.Family               3.399809e-03  NA NA
Close.Family             1.585158e-03  NA NA
Helpful.Family           1.067564e-03  NA NA
See.Friends              2.320842e-03  NA NA
Close.Friends           -7.445241e-04  NA NA
Helpful.Friends          1.773664e-03  NA NA
Family.Respect           1.169300e-03  NA NA
Similar.Values           4.038333e-04  NA NA
Successful.Family       -3.801373e-04  NA NA
Trust                   -2.101630e-04  NA NA
Loyalty                 -3.523717e-04  NA NA
Family.Pride            -2.651401e-04  NA NA
Expression               3.227736e-04  NA NA
Spend.Time.Together      3.551350e-04  NA NA
Feel.Close              -2.444042e-04  NA NA
Togetherness            -7.700216e-04  NA NA
Religious.Attendance     2.228277e-04  NA NA
Religious.Importance     1.200384e-03  NA NA
Close.knit.Community    -2.856467e-04  NA NA
Helpful.Community        7.287934e-04  NA NA
Community.Shares.Values -4.119143e-04  NA NA
Get.Along                4.864171e-04  NA NA
Community.Trust          1.569440e-04  NA NA
Health.Insurance         1.849156e-03  NA NA
Dental.Insurance         5.658109e-03  NA NA
Discrimination           2.479901e-03  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

ggsave(filename="healthpro_test_VIMP.png",width=5,height=5,units="in")

Health Insurance

ps(`Health Insurance`)
# A tibble: 3 × 3
  `Health Insurance`     n    pct
  <fct>              <int>  <dbl>
1 0                    381 14.6  
2 Yes                 2207 84.6  
3 <NA>                  21  0.805

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Health.Insurance ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1936
           Frequency of class labels: 259, 1677
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 295.831
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1224
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.4749
                   (OOB) Brier score: 0.10519154
        (OOB) Normalized Brier score: 0.42076617
                           (OOB) AUC: 0.73382845
                      (OOB) Log-loss: 0.35223368
                        (OOB) PR-AUC: 0.3214195
                        (OOB) G-mean: 0.66878662
   (OOB) Requested performance error: 0.33121338

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   203  56      0.2162
       Yes 720 957      0.4293

      (OOB) Misclassification rate: 0.4008264
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1677.0000000  259.0000000    6.4749035    0.1337810    0.7837838    0.5706619 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2199350    0.9447187    0.4008264    0.1051915    0.4207662    0.7338284 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.3522337    0.3434856    0.4633100    0.1337810    0.3214195    0.5061361 
  F1modgmean        gmean 
   0.5660483    0.6687866 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
# ind_pos <- sample(c(0,1), nrow(pos), replace = T, prob = c(0.7, 0.3))
# ind_neg <- sample(c(0,1), nrow(neg), replace = T, prob = c(0.7, 0.3))
# 
# 
# train <- bind_rows(pos[ind_pos==0,],neg[ind_neg==0,])
# test <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])

imbal_index <- createDataPartition(rfdata$Health.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Health.Insurance~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Health.Insurance ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="auc")
print(rfobj)
                         Sample size: 1550
           Frequency of class labels: 760, 790
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 277.4057
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 980
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0395
                   (OOB) Brier score: 0.12398991
        (OOB) Normalized Brier score: 0.49595964
                           (OOB) AUC: 0.96556213
                      (OOB) Log-loss: 0.4175614
                        (OOB) PR-AUC: 0.96399298
                        (OOB) G-mean: 0.89402301
   (OOB) Requested performance error: 0.10597699

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 661  99      0.1303
       0    64 726      0.0810

      (OOB) Misclassification rate: 0.1051613
print(rfobj)
                         Sample size: 1550
           Frequency of class labels: 760, 790
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 277.4057
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 980
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0395
                   (OOB) Brier score: 0.12398991
        (OOB) Normalized Brier score: 0.49595964
                           (OOB) AUC: 0.96556213
                      (OOB) Log-loss: 0.4175614
                        (OOB) PR-AUC: 0.96399298
                        (OOB) G-mean: 0.89402301
   (OOB) Requested performance error: 0.10597699

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 661  99      0.1303
       0    64 726      0.0810

      (OOB) Misclassification rate: 0.1051613
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
EnglishSpeak              0.0218    NA   NA
Religion                  0.0201    NA   NA
EnglishDiff               0.0176    NA   NA
Ethnicity                 0.0143    NA   NA
Income_median             0.0132    NA   NA
Community.Shares.Values   0.0129    NA   NA
Helpful.Community         0.0118    NA   NA
Helpful.Family            0.0115    NA   NA
Close.Family              0.0108    NA   NA
Employment                0.0104    NA   NA
Get.Along                 0.0101    NA   NA
Religious.Importance      0.0094    NA   NA
Religious.Attendance      0.0094    NA   NA
Community.Trust           0.0094    NA   NA
Successful.Family         0.0077    NA   NA
Age                       0.0076    NA   NA
Close.knit.Community      0.0069    NA   NA
Close.Friends             0.0065    NA   NA
Similar.Values            0.0065    NA   NA
Feel.Close                0.0050    NA   NA
Helpful.Friends           0.0046    NA   NA
Togetherness              0.0044    NA   NA
Spend.Time.Together       0.0039    NA   NA
See.Friends               0.0038    NA   NA
Gender                    0.0031    NA   NA
Family.Pride              0.0030    NA   NA
rfobj$importance
                                all Yes  0
Ethnicity               0.014316200  NA NA
Age                     0.007633590  NA NA
Gender                  0.003083911  NA NA
Religion                0.020066035  NA NA
Employment              0.010442500  NA NA
Income_median           0.013158258  NA NA
EnglishSpeak            0.021816327  NA NA
EnglishDiff             0.017565438  NA NA
See.Family              0.002583976  NA NA
Close.Family            0.010779038  NA NA
Helpful.Family          0.011509116  NA NA
See.Friends             0.003816097  NA NA
Close.Friends           0.006521899  NA NA
Helpful.Friends         0.004615363  NA NA
Family.Respect          0.001353556  NA NA
Similar.Values          0.006459971  NA NA
Successful.Family       0.007691842  NA NA
Trust                   0.002645635  NA NA
Loyalty                 0.002645635  NA NA
Family.Pride            0.003028764  NA NA
Expression              0.001907875  NA NA
Spend.Time.Together     0.003937719  NA NA
Feel.Close              0.005049925  NA NA
Togetherness            0.004375695  NA NA
Religious.Attendance    0.009433654  NA NA
Religious.Importance    0.009433654  NA NA
Close.knit.Community    0.006850379  NA NA
Helpful.Community       0.011776207  NA NA
Community.Shares.Values 0.012920046  NA NA
Get.Along               0.010053332  NA NA
Community.Trust         0.009381877  NA NA
Discrimination          0.001482306  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

ggsave(filename="HIns_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
335.0000000  51.0000000   6.5686275   0.1321244   0.5686275   0.5522388 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1620112   0.8937198   0.4455959   0.1114667   0.4458667   0.6172666 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.3763007   0.2521739   0.3682982   0.1321244   0.2359474   0.4062736 
 F1modgmean       gmean 
  0.4643357   0.5603732 
test_rf$importance
                                  all Yes  0
Ethnicity               -0.0017525858  NA NA
Age                     -0.0052636834  NA NA
Gender                   0.0005795693  NA NA
Religion                 0.0036039587  NA NA
Employment               0.0034489475  NA NA
Income_median            0.0270547814  NA NA
EnglishSpeak             0.0025917176  NA NA
EnglishDiff             -0.0066131418  NA NA
See.Family               0.0054740860  NA NA
Close.Family            -0.0044527376  NA NA
Helpful.Family           0.0031669773  NA NA
See.Friends             -0.0011282253  NA NA
Close.Friends            0.0011439600  NA NA
Helpful.Friends          0.0047502384  NA NA
Family.Respect          -0.0003789466  NA NA
Similar.Values          -0.0005181470  NA NA
Successful.Family        0.0002480748  NA NA
Trust                   -0.0019574952  NA NA
Loyalty                  0.0004630898  NA NA
Family.Pride            -0.0003169345  NA NA
Expression               0.0007673952  NA NA
Spend.Time.Together      0.0004440892  NA NA
Feel.Close               0.0014248742  NA NA
Togetherness             0.0011873526  NA NA
Religious.Attendance     0.0024413037  NA NA
Religious.Importance     0.0042524577  NA NA
Close.knit.Community    -0.0019943671  NA NA
Helpful.Community       -0.0012136076  NA NA
Community.Shares.Values -0.0017010414  NA NA
Get.Along               -0.0033894357  NA NA
Community.Trust         -0.0025111518  NA NA
Discrimination          -0.0018204015  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

ggsave(filename="HIns_test_VIMP.png",width=5,height=5,units="in")

Dental Insurance

ps(`Dental Insurance`)
# A tibble: 3 × 3
  `Dental Insurance`     n   pct
  <fct>              <int> <dbl>
1 0                   1050 40.2 
2 Yes                 1529 58.6 
3 <NA>                  30  1.15

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Dental.Insurance ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1932
           Frequency of class labels: 760, 1172
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 451.0523
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1221
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.5421
                   (OOB) Brier score: 0.17933324
        (OOB) Normalized Brier score: 0.71733296
                           (OOB) AUC: 0.79775799
                      (OOB) Log-loss: 0.53718287
                        (OOB) PR-AUC: 0.71577982
                        (OOB) G-mean: 0.72787095
   (OOB) Requested performance error: 0.27212905

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   572 188      0.2474
       Yes 347 825      0.2961

      (OOB) Misclassification rate: 0.2769151
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1172.0000000  760.0000000    1.5421053    0.3933747    0.7526316    0.7039249 
        prec          npv     misclass        brier   brier.norm          auc 
   0.6224157    0.8144126    0.2769151    0.1793332    0.7173330    0.7977580 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.5371829    0.6813580    0.7163581    0.3933747    0.7157798    0.7046145 
  F1modgmean        gmean 
   0.7221145    0.7278710 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Dental.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dental.Insurance~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dental.Insurance ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="auc")
print(rfobj)
                         Sample size: 1546
           Frequency of class labels: 757, 789
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 284.784
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 977
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0423
                   (OOB) Brier score: 0.13865716
        (OOB) Normalized Brier score: 0.55462863
                           (OOB) AUC: 0.91423938
                      (OOB) Log-loss: 0.44785029
                        (OOB) PR-AUC: 0.90865796
                        (OOB) G-mean: 0.84719663
   (OOB) Requested performance error: 0.15280337

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 624 133      0.1757
       0   102 687      0.1293

      (OOB) Misclassification rate: 0.1520052
print(rfobj)
                         Sample size: 1546
           Frequency of class labels: 757, 789
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 284.784
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 977
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0423
                   (OOB) Brier score: 0.13865716
        (OOB) Normalized Brier score: 0.55462863
                           (OOB) AUC: 0.91423938
                      (OOB) Log-loss: 0.44785029
                        (OOB) PR-AUC: 0.90865796
                        (OOB) G-mean: 0.84719663
   (OOB) Requested performance error: 0.15280337

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 624 133      0.1757
       0   102 687      0.1293

      (OOB) Misclassification rate: 0.1520052
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Income_median             0.0317    NA   NA
EnglishSpeak              0.0308    NA   NA
Employment                0.0242    NA   NA
Age                       0.0211    NA   NA
EnglishDiff               0.0203    NA   NA
Ethnicity                 0.0186    NA   NA
Religion                  0.0155    NA   NA
Helpful.Family            0.0135    NA   NA
Discrimination            0.0104    NA   NA
Religious.Attendance      0.0095    NA   NA
See.Friends               0.0087    NA   NA
Close.Friends             0.0085    NA   NA
Close.knit.Community      0.0082    NA   NA
See.Family                0.0077    NA   NA
Helpful.Friends           0.0075    NA   NA
Community.Shares.Values   0.0070    NA   NA
Religious.Importance      0.0070    NA   NA
Expression                0.0070    NA   NA
Helpful.Community         0.0065    NA   NA
Spend.Time.Together       0.0058    NA   NA
Community.Trust           0.0057    NA   NA
Get.Along                 0.0052    NA   NA
Close.Family              0.0043    NA   NA
Feel.Close                0.0039    NA   NA
Trust                     0.0038    NA   NA
Gender                    0.0032    NA   NA
rfobj$importance
                                  all Yes  0
Ethnicity                0.0186314687  NA NA
Age                      0.0211470094  NA NA
Gender                   0.0032076047  NA NA
Religion                 0.0155455542  NA NA
Employment               0.0241664494  NA NA
Income_median            0.0317198196  NA NA
EnglishSpeak             0.0308302749  NA NA
EnglishDiff              0.0203154072  NA NA
See.Family               0.0077108636  NA NA
Close.Family             0.0043271910  NA NA
Helpful.Family           0.0135096856  NA NA
See.Friends              0.0087236360  NA NA
Close.Friends            0.0084521130  NA NA
Helpful.Friends          0.0074835320  NA NA
Family.Respect           0.0011747097  NA NA
Similar.Values           0.0024124893  NA NA
Successful.Family       -0.0007970438  NA NA
Trust                    0.0037652270  NA NA
Loyalty                  0.0024699663  NA NA
Family.Pride             0.0024699663  NA NA
Expression               0.0069742534  NA NA
Spend.Time.Together      0.0057984113  NA NA
Feel.Close               0.0039498593  NA NA
Togetherness             0.0012954372  NA NA
Religious.Attendance     0.0095077529  NA NA
Religious.Importance     0.0070330389  NA NA
Close.knit.Community     0.0082126069  NA NA
Helpful.Community        0.0064772321  NA NA
Community.Shares.Values  0.0070330389  NA NA
Get.Along                0.0051817764  NA NA
Community.Trust          0.0057377223  NA NA
Discrimination           0.0104276531  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

ggsave(filename="DIns_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
234.0000000 152.0000000   1.5394737   0.3937824   0.8157895   0.6880342 
       prec         npv    misclass       brier  brier.norm         auc 
  0.6294416   0.8518519   0.2616580   0.1838678   0.7354711   0.7966712 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.5503655   0.7106017   0.7350448   0.3937824   0.7006867   0.7298977 
 F1modgmean       gmean 
  0.7421192   0.7491936 
test_rf$importance
                                  all Yes  0
Ethnicity                6.955105e-03  NA NA
Age                      1.049989e-02  NA NA
Gender                   5.533942e-04  NA NA
Religion                 4.172060e-04  NA NA
Employment               1.801953e-02  NA NA
Income_median            6.474625e-02  NA NA
EnglishSpeak             2.222726e-02  NA NA
EnglishDiff              6.401071e-03  NA NA
See.Family               1.218979e-03  NA NA
Close.Family             8.783462e-04  NA NA
Helpful.Family           3.347928e-03  NA NA
See.Friends              8.517619e-04  NA NA
Close.Friends            4.657271e-03  NA NA
Helpful.Friends          4.084200e-03  NA NA
Family.Respect           6.792115e-04  NA NA
Similar.Values           5.809173e-05  NA NA
Successful.Family        8.622585e-04  NA NA
Trust                    1.698674e-03  NA NA
Loyalty                 -4.762300e-04  NA NA
Family.Pride             2.149872e-04  NA NA
Expression               1.251987e-03  NA NA
Spend.Time.Together      1.459530e-03  NA NA
Feel.Close               3.209875e-04  NA NA
Togetherness             4.915094e-04  NA NA
Religious.Attendance    -2.660509e-04  NA NA
Religious.Importance     4.327772e-04  NA NA
Close.knit.Community     2.316384e-03  NA NA
Helpful.Community        8.348674e-04  NA NA
Community.Shares.Values  1.343737e-04  NA NA
Get.Along                1.595639e-03  NA NA
Community.Trust         -7.213593e-04  NA NA
Discrimination          -1.198574e-03  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

ggsave(filename="DIns_test_VIMP.png",width=5,height=5,units="in")

Physical Checkup

ps(`Physical Check-up`)
# A tibble: 3 × 3
  `Physical Check-up`     n   pct
  <fct>               <int> <dbl>
1 0                     833 31.9 
2 Yes                  1740 66.7 
3 <NA>                   36  1.38

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Physical.Check.up ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1918
           Frequency of class labels: 614, 1304
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 451.908
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1212
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 2.1238
                   (OOB) Brier score: 0.18414117
        (OOB) Normalized Brier score: 0.73656469
                           (OOB) AUC: 0.74369423
                      (OOB) Log-loss: 0.54902518
                        (OOB) PR-AUC: 0.55768871
                        (OOB) G-mean: 0.6964641
   (OOB) Requested performance error: 0.3035359

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   432 182      0.2964
       Yes 405 899      0.3106

      (OOB) Misclassification rate: 0.306048
plot(imb,plots.one.page = F)


                              all    0   Yes
Age                        0.0451   NA    NA
Health.Insurance           0.0367   NA    NA
Dental.Insurance           0.0277   NA    NA
Gender                     0.0172   NA    NA
Income_median              0.0074   NA    NA
EnglishDiff                0.0061   NA    NA
Community.Shares.Values    0.0055   NA    NA
Employment                 0.0055   NA    NA
Discrimination             0.0045   NA    NA
Togetherness               0.0043   NA    NA
EnglishSpeak               0.0035   NA    NA
Helpful.Family             0.0032   NA    NA
Close.knit.Community       0.0029   NA    NA
Religious.Importance       0.0023   NA    NA
Close.Family               0.0019   NA    NA
Get.Along                  0.0019   NA    NA
Religion                   0.0016   NA    NA
See.Family                 0.0016   NA    NA
Ethnicity                  0.0012   NA    NA
Loyalty                    0.0008   NA    NA
Family.Respect             0.0000   NA    NA
Trust                     -0.0001   NA    NA
Religious.Attendance      -0.0005   NA    NA
See.Friends               -0.0008   NA    NA
Community.Trust           -0.0008   NA    NA
Family.Pride              -0.0015   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1304.0000000  614.0000000    2.1237785    0.3201251    0.7035831    0.6894172 
        prec          npv     misclass        brier   brier.norm          auc 
   0.5161290    0.8316374    0.3060480    0.1841412    0.7365647    0.7436942 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.5490252    0.5954514    0.6653643    0.3201251    0.5576887    0.6459578 
  F1modgmean        gmean 
   0.6809142    0.6964641 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Physical.Check.up~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Physical.Check.up~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="auc")
print(rfobj)
                         Sample size: 1536
           Frequency of class labels: 754, 782
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 297.9053
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 971
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0371
                   (OOB) Brier score: 0.14734902
        (OOB) Normalized Brier score: 0.58939609
                           (OOB) AUC: 0.9177545
                      (OOB) Log-loss: 0.47059335
                        (OOB) PR-AUC: 0.91289228
                        (OOB) G-mean: 0.84443067
   (OOB) Requested performance error: 0.15556933

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 659  95      0.1260
       0   144 638      0.1841

      (OOB) Misclassification rate: 0.155599
print(rfobj)
                         Sample size: 1536
           Frequency of class labels: 754, 782
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 297.9053
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 971
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0371
                   (OOB) Brier score: 0.14734902
        (OOB) Normalized Brier score: 0.58939609
                           (OOB) AUC: 0.9177545
                      (OOB) Log-loss: 0.47059335
                        (OOB) PR-AUC: 0.91289228
                        (OOB) G-mean: 0.84443067
   (OOB) Requested performance error: 0.15556933

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 659  95      0.1260
       0   144 638      0.1841

      (OOB) Misclassification rate: 0.155599
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Age                       0.0339    NA   NA
Ethnicity                 0.0271    NA   NA
Gender                    0.0239    NA   NA
EnglishDiff               0.0212    NA   NA
Health.Insurance          0.0201    NA   NA
Helpful.Family            0.0157    NA   NA
Discrimination            0.0150    NA   NA
Religion                  0.0145    NA   NA
Income_median             0.0137    NA   NA
EnglishSpeak              0.0132    NA   NA
Community.Shares.Values   0.0131    NA   NA
Helpful.Community         0.0123    NA   NA
Close.Family              0.0119    NA   NA
Religious.Importance      0.0112    NA   NA
Close.Friends             0.0104    NA   NA
See.Family                0.0104    NA   NA
Spend.Time.Together       0.0098    NA   NA
Religious.Attendance      0.0085    NA   NA
Helpful.Friends           0.0066    NA   NA
Employment                0.0065    NA   NA
Expression                0.0059    NA   NA
Successful.Family         0.0059    NA   NA
Community.Trust           0.0052    NA   NA
Trust                     0.0052    NA   NA
Close.knit.Community      0.0047    NA   NA
See.Friends               0.0040    NA   NA
rfobj$importance
                                  all Yes  0
Ethnicity                0.0271351212  NA NA
Age                      0.0339397455  NA NA
Gender                   0.0239012227  NA NA
Religion                 0.0144600197  NA NA
Employment               0.0065346238  NA NA
Income_median            0.0137359406  NA NA
EnglishSpeak             0.0132246656  NA NA
EnglishDiff              0.0211588431  NA NA
See.Family               0.0103506437  NA NA
Close.Family             0.0118872456  NA NA
Helpful.Family           0.0157320061  NA NA
See.Friends              0.0039800567  NA NA
Close.Friends            0.0104197808  NA NA
Helpful.Friends          0.0065852278  NA NA
Family.Respect           0.0006851545  NA NA
Similar.Values           0.0018920576  NA NA
Successful.Family        0.0058507574  NA NA
Trust                    0.0051714898  NA NA
Loyalty                  0.0019645267  NA NA
Family.Pride            -0.0012563461  NA NA
Expression               0.0059205347  NA NA
Spend.Time.Together      0.0098028332  NA NA
Feel.Close               0.0039296096  NA NA
Togetherness             0.0019433893  NA NA
Religious.Attendance     0.0084556719  NA NA
Religious.Importance     0.0111877913  NA NA
Close.knit.Community     0.0047058219  NA NA
Helpful.Community        0.0123436839  NA NA
Community.Shares.Values  0.0130716504  NA NA
Get.Along                0.0038690770  NA NA
Community.Trust          0.0051896773  NA NA
Health.Insurance         0.0200749381  NA NA
Dental.Insurance         0.0005655640  NA NA
Discrimination           0.0150385112  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

ggsave(filename="PChk_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
260.0000000 122.0000000   2.1311475   0.3193717   0.6065574   0.7000000 
       prec         npv    misclass       brier  brier.norm         auc 
  0.4868421   0.7913043   0.3298429   0.1947935   0.7791741   0.6841110 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.5772494   0.5401460   0.6254876   0.3193717   0.5048314   0.5958759 
 F1modgmean       gmean 
  0.6385467   0.6516058 
test_rf$importance
                                  all Yes  0
Ethnicity               -2.208910e-03  NA NA
Age                      1.268450e-02  NA NA
Gender                  -9.681833e-04  NA NA
Religion                 1.794378e-03  NA NA
Employment              -2.685932e-04  NA NA
Income_median            4.498425e-03  NA NA
EnglishSpeak             1.726043e-03  NA NA
EnglishDiff              1.334770e-03  NA NA
See.Family              -2.002236e-03  NA NA
Close.Family            -5.934351e-04  NA NA
Helpful.Family           2.611360e-03  NA NA
See.Friends             -7.542945e-04  NA NA
Close.Friends           -1.573132e-03  NA NA
Helpful.Friends          6.476850e-04  NA NA
Family.Respect          -5.441438e-04  NA NA
Similar.Values           3.513048e-04  NA NA
Successful.Family        2.577504e-04  NA NA
Trust                    1.661835e-04  NA NA
Loyalty                  3.849821e-04  NA NA
Family.Pride            -5.585242e-04  NA NA
Expression               4.167576e-05  NA NA
Spend.Time.Together      6.177973e-06  NA NA
Feel.Close              -8.730384e-04  NA NA
Togetherness            -7.008496e-04  NA NA
Religious.Attendance    -1.812429e-03  NA NA
Religious.Importance    -1.024334e-03  NA NA
Close.knit.Community     7.397242e-05  NA NA
Helpful.Community       -4.507673e-04  NA NA
Community.Shares.Values  1.940180e-04  NA NA
Get.Along               -9.124803e-05  NA NA
Community.Trust          8.462331e-05  NA NA
Health.Insurance         2.294108e-02  NA NA
Dental.Insurance         3.090500e-02  NA NA
Discrimination          -1.006681e-03  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

ggsave(filename="PChk_test_VIMP.png",width=5,height=5,units="in")

Dental Checkup

ps(`Dentist Check-up`)
# A tibble: 3 × 3
  `Dentist Check-up`     n   pct
  <fct>              <int> <dbl>
1 0                   1100 42.2 
2 Yes                 1462 56.0 
3 <NA>                  47  1.80

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Dentist.Check.up ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1915
           Frequency of class labels: 786, 1129
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 472.9973
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1210
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.4364
                   (OOB) Brier score: 0.19312832
        (OOB) Normalized Brier score: 0.77251328
                           (OOB) AUC: 0.76845319
                      (OOB) Log-loss: 0.5707148
                        (OOB) PR-AUC: 0.66442725
                        (OOB) G-mean: 0.69989128
   (OOB) Requested performance error: 0.30010872

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   566 220      0.2799
       Yes 361 768      0.3198

      (OOB) Misclassification rate: 0.3033943
plot(imb,plots.one.page = F)


                              all    0   Yes
Dental.Insurance           0.0207   NA    NA
Health.Insurance           0.0023   NA    NA
Income_median              0.0014   NA    NA
Helpful.Community          0.0012   NA    NA
EnglishDiff                0.0006   NA    NA
Community.Trust            0.0002   NA    NA
Spend.Time.Together       -0.0006   NA    NA
EnglishSpeak              -0.0006   NA    NA
Helpful.Friends           -0.0011   NA    NA
Employment                -0.0012   NA    NA
Family.Respect            -0.0014   NA    NA
Feel.Close                -0.0015   NA    NA
Togetherness              -0.0016   NA    NA
Loyalty                   -0.0017   NA    NA
Family.Pride              -0.0017   NA    NA
Helpful.Family            -0.0017   NA    NA
Religion                  -0.0020   NA    NA
Community.Shares.Values   -0.0021   NA    NA
Trust                     -0.0023   NA    NA
Close.Friends             -0.0025   NA    NA
Discrimination            -0.0026   NA    NA
Close.knit.Community      -0.0026   NA    NA
See.Family                -0.0028   NA    NA
Expression                -0.0031   NA    NA
Successful.Family         -0.0040   NA    NA
Get.Along                 -0.0042   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1129.0000000  786.0000000    1.4363868    0.4104439    0.7201018    0.6802480 
        prec          npv     misclass        brier   brier.norm          auc 
   0.6105717    0.7773279    0.3033943    0.1931283    0.7725133    0.7684532 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.5707148    0.6608290    0.6916811    0.4104439    0.6644272    0.6803601 
  F1modgmean        gmean 
   0.6957862    0.6998913 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dentist.Check.up~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dentist.Check.up~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="auc")
print(rfobj)
                         Sample size: 1533
           Frequency of class labels: 754, 779
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 299.2413
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 969
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0332
                   (OOB) Brier score: 0.15203825
        (OOB) Normalized Brier score: 0.60815301
                           (OOB) AUC: 0.897689
                      (OOB) Log-loss: 0.47986324
                        (OOB) PR-AUC: 0.89419853
                        (OOB) G-mean: 0.81348703
   (OOB) Requested performance error: 0.18651297

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 616 138       0.183
       0   148 631       0.190

      (OOB) Misclassification rate: 0.1865623
print(rfobj)
                         Sample size: 1533
           Frequency of class labels: 754, 779
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 299.2413
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 969
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0332
                   (OOB) Brier score: 0.15203825
        (OOB) Normalized Brier score: 0.60815301
                           (OOB) AUC: 0.897689
                      (OOB) Log-loss: 0.47986324
                        (OOB) PR-AUC: 0.89419853
                        (OOB) G-mean: 0.81348703
   (OOB) Requested performance error: 0.18651297

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 616 138       0.183
       0   148 631       0.190

      (OOB) Misclassification rate: 0.1865623
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Ethnicity                 0.0382    NA   NA
Religion                  0.0295    NA   NA
Dental.Insurance          0.0195    NA   NA
Religious.Importance      0.0189    NA   NA
EnglishDiff               0.0182    NA   NA
EnglishSpeak              0.0156    NA   NA
Gender                    0.0156    NA   NA
Helpful.Friends           0.0137    NA   NA
Religious.Attendance      0.0136    NA   NA
See.Friends               0.0130    NA   NA
Age                       0.0123    NA   NA
Close.Friends             0.0123    NA   NA
See.Family                0.0091    NA   NA
Helpful.Community         0.0084    NA   NA
Close.knit.Community      0.0084    NA   NA
Successful.Family         0.0084    NA   NA
Expression                0.0084    NA   NA
Get.Along                 0.0078    NA   NA
Similar.Values            0.0078    NA   NA
Helpful.Family            0.0071    NA   NA
Discrimination            0.0065    NA   NA
Spend.Time.Together       0.0065    NA   NA
Close.Family              0.0065    NA   NA
Community.Trust           0.0058    NA   NA
Community.Shares.Values   0.0052    NA   NA
Togetherness              0.0052    NA   NA
rfobj$importance
                                  all Yes  0
Ethnicity                3.819788e-02  NA NA
Age                      1.234635e-02  NA NA
Gender                   1.559907e-02  NA NA
Religion                 2.945558e-02  NA NA
Employment               5.166942e-03  NA NA
Income_median            2.563610e-03  NA NA
EnglishSpeak             1.559907e-02  NA NA
EnglishDiff              1.823009e-02  NA NA
See.Family               9.096205e-03  NA NA
Close.Family             6.467539e-03  NA NA
Helpful.Family           7.117568e-03  NA NA
See.Friends              1.298946e-02  NA NA
Close.Friends            1.234104e-02  NA NA
Helpful.Friends          1.367991e-02  NA NA
Family.Respect           4.534265e-03  NA NA
Similar.Values           7.791365e-03  NA NA
Successful.Family        8.423430e-03  NA NA
Trust                    4.524794e-03  NA NA
Loyalty                  1.304900e-03  NA NA
Family.Pride            -2.720675e-05  NA NA
Expression               8.422373e-03  NA NA
Spend.Time.Together      6.471758e-03  NA NA
Feel.Close               4.534265e-03  NA NA
Togetherness             5.173261e-03  NA NA
Religious.Attendance     1.364585e-02  NA NA
Religious.Importance     1.886188e-02  NA NA
Close.knit.Community     8.423430e-03  NA NA
Helpful.Community        8.426602e-03  NA NA
Community.Shares.Values  5.204855e-03  NA NA
Get.Along                7.801931e-03  NA NA
Community.Trust          5.816979e-03  NA NA
Health.Insurance         3.354449e-03  NA NA
Dental.Insurance         1.954565e-02  NA NA
Discrimination           6.471758e-03  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

ggsave(filename="DChk_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
225.0000000 157.0000000   1.4331210   0.4109948   0.8025478   0.7022222 
       prec         npv    misclass       brier  brier.norm         auc 
  0.6528497   0.8359788   0.2565445   0.1879495   0.7517980   0.7904600 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.5594116   0.7200000   0.7410109   0.4109948   0.6829062   0.7353555 
 F1modgmean       gmean 
  0.7458609   0.7507109 
test_rf$importance
                                  all Yes  0
Ethnicity                8.508110e-03  NA NA
Age                      8.271317e-03  NA NA
Gender                   1.358924e-03  NA NA
Religion                 6.568436e-03  NA NA
Employment              -4.547879e-04  NA NA
Income_median            4.703719e-03  NA NA
EnglishSpeak             8.740063e-03  NA NA
EnglishDiff              5.355667e-03  NA NA
See.Family               3.062052e-03  NA NA
Close.Family             2.657587e-03  NA NA
Helpful.Family           1.579786e-03  NA NA
See.Friends              3.530472e-03  NA NA
Close.Friends            1.913067e-03  NA NA
Helpful.Friends          2.426828e-03  NA NA
Family.Respect           4.885193e-04  NA NA
Similar.Values          -1.385758e-05  NA NA
Successful.Family        7.502447e-04  NA NA
Trust                    7.173918e-04  NA NA
Loyalty                  1.029924e-03  NA NA
Family.Pride             8.650342e-04  NA NA
Expression              -3.067024e-04  NA NA
Spend.Time.Together      2.456691e-03  NA NA
Feel.Close               3.199467e-04  NA NA
Togetherness             2.767349e-04  NA NA
Religious.Attendance     6.427582e-04  NA NA
Religious.Importance     9.328929e-04  NA NA
Close.knit.Community    -1.041299e-03  NA NA
Helpful.Community        5.545616e-04  NA NA
Community.Shares.Values -2.149310e-04  NA NA
Get.Along                7.956774e-04  NA NA
Community.Trust         -4.289379e-05  NA NA
Health.Insurance         6.140608e-03  NA NA
Dental.Insurance         6.922811e-02  NA NA
Discrimination           3.693327e-03  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

ggsave(filename="DChk_test_VIMP.png",width=5,height=5,units="in")

Urgent Care

ps(`Urgentcare`)
# A tibble: 3 × 3
  Urgentcare     n   pct
  <fct>      <int> <dbl>
1 0           2112 81.0 
2 Yes          440 16.9 
3 <NA>          57  2.18

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(`Urgentcare` ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1908
           Frequency of class labels: 1594, 314
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 357.6643
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1206
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 5.0764
                   (OOB) Brier score: 0.13517241
        (OOB) Normalized Brier score: 0.54068965
                           (OOB) AUC: 0.59929952
                      (OOB) Log-loss: 0.43902022
                        (OOB) PR-AUC: 0.23059263
                        (OOB) G-mean: 0.56307797
   (OOB) Requested performance error: 0.43692203

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   818 776      0.4868
       Yes 120 194      0.3822

      (OOB) Misclassification rate: 0.4696017
plot(imb,plots.one.page = F)


                              all    0   Yes
Age                        0.0126   NA    NA
Family.Pride               0.0055   NA    NA
Discrimination             0.0055   NA    NA
Spend.Time.Together        0.0046   NA    NA
Health.Insurance           0.0046   NA    NA
Helpful.Family             0.0039   NA    NA
Close.Family               0.0037   NA    NA
Trust                      0.0022   NA    NA
Similar.Values             0.0022   NA    NA
Dental.Insurance           0.0008   NA    NA
Employment                 0.0003   NA    NA
Togetherness               0.0000   NA    NA
Income_median              0.0000   NA    NA
Helpful.Community         -0.0005   NA    NA
See.Friends               -0.0015   NA    NA
Loyalty                   -0.0021   NA    NA
Close.knit.Community      -0.0021   NA    NA
Expression                -0.0025   NA    NA
Community.Shares.Values   -0.0033   NA    NA
Feel.Close                -0.0036   NA    NA
Family.Respect            -0.0040   NA    NA
Helpful.Friends           -0.0060   NA    NA
Successful.Family         -0.0068   NA    NA
Ethnicity                 -0.0068   NA    NA
EnglishDiff               -0.0071   NA    NA
Gender                    -0.0072   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1594.0000000  314.0000000    5.0764331    0.1645702    0.6178344    0.5131744 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2000000    0.8720682    0.4696017    0.1351724    0.5406897    0.5992995 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.4390202    0.3021807    0.4117806    0.1645702    0.2305926    0.4326293 
  F1modgmean        gmean 
   0.4874293    0.5630780 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Urgentcare,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Urgentcare~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Urgentcare~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="auc")
print(rfobj)
                         Sample size: 1528
           Frequency of class labels: 751, 777
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 305.359
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 966
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0346
                   (OOB) Brier score: 0.15050494
        (OOB) Normalized Brier score: 0.60201977
                           (OOB) AUC: 0.93951437
                      (OOB) Log-loss: 0.48064091
                        (OOB) PR-AUC: 0.92901033
                        (OOB) G-mean: 0.86360265
   (OOB) Requested performance error: 0.13639735

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   640 111      0.1478
       Yes  97 680      0.1248

      (OOB) Misclassification rate: 0.1361257
print(rfobj)
                         Sample size: 1528
           Frequency of class labels: 751, 777
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 305.359
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 966
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0346
                   (OOB) Brier score: 0.15050494
        (OOB) Normalized Brier score: 0.60201977
                           (OOB) AUC: 0.93951437
                      (OOB) Log-loss: 0.48064091
                        (OOB) PR-AUC: 0.92901033
                        (OOB) G-mean: 0.86360265
   (OOB) Requested performance error: 0.13639735

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   640 111      0.1478
       Yes  97 680      0.1248

      (OOB) Misclassification rate: 0.1361257
plot(rfobj,plots.one.page = FALSE)


                             all    0   Yes
Ethnicity                 0.0292   NA    NA
Religion                  0.0258   NA    NA
Community.Shares.Values   0.0232   NA    NA
EnglishSpeak              0.0219   NA    NA
Dental.Insurance          0.0206   NA    NA
Close.Friends             0.0181   NA    NA
Religious.Attendance      0.0180   NA    NA
Income_median             0.0148   NA    NA
EnglishDiff               0.0147   NA    NA
Helpful.Community         0.0135   NA    NA
Get.Along                 0.0135   NA    NA
Helpful.Family            0.0132   NA    NA
Close.knit.Community      0.0127   NA    NA
See.Friends               0.0123   NA    NA
Community.Trust           0.0116   NA    NA
Religious.Importance      0.0114   NA    NA
Close.Family              0.0109   NA    NA
Helpful.Friends           0.0109   NA    NA
Togetherness              0.0104   NA    NA
Employment                0.0089   NA    NA
Family.Pride              0.0084   NA    NA
Age                       0.0082   NA    NA
Discrimination            0.0076   NA    NA
See.Family                0.0064   NA    NA
Expression                0.0063   NA    NA
Similar.Values            0.0063   NA    NA
rfobj$importance
                                  all  0 Yes
Ethnicity                0.0291504093 NA  NA
Age                      0.0081546189 NA  NA
Gender                   0.0057340537 NA  NA
Religion                 0.0257942205 NA  NA
Employment               0.0089111982 NA  NA
Income_median            0.0148154717 NA  NA
EnglishSpeak             0.0219076271 NA  NA
EnglishDiff              0.0146781895 NA  NA
See.Family               0.0064375131 NA  NA
Close.Family             0.0108874751 NA  NA
Helpful.Family           0.0132269812 NA  NA
See.Friends              0.0122813487 NA  NA
Close.Friends            0.0180530851 NA  NA
Helpful.Friends          0.0108633588 NA  NA
Family.Respect          -0.0002767768 NA  NA
Similar.Values           0.0063445514 NA  NA
Successful.Family        0.0062925772 NA  NA
Trust                    0.0062486015 NA  NA
Loyalty                 -0.0016068460 NA  NA
Family.Pride             0.0084190950 NA  NA
Expression               0.0063445514 NA  NA
Spend.Time.Together      0.0036739590 NA  NA
Feel.Close               0.0024163880 NA  NA
Togetherness             0.0104384214 NA  NA
Religious.Attendance     0.0179953247 NA  NA
Religious.Importance     0.0114493763 NA  NA
Close.knit.Community     0.0127434595 NA  NA
Helpful.Community        0.0134819482 NA  NA
Community.Shares.Values  0.0232494227 NA  NA
Get.Along                0.0134597741 NA  NA
Community.Trust          0.0116424583 NA  NA
Health.Insurance         0.0043946796 NA  NA
Dental.Insurance         0.0205984596 NA  NA
Discrimination           0.0075578444 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

ggsave(filename="UC_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
318.0000000  62.0000000   5.1290323   0.1631579   0.4354839   0.6132075 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1800000   0.8478261   0.4157895   0.1381798   0.5527191   0.5213025 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.4504602   0.2547170   0.3751603   0.1631579   0.1688369   0.3857390 
 F1modgmean       gmean 
  0.4459607   0.5167611 
test_rf$importance
                                  all  0 Yes
Ethnicity               -4.404010e-04 NA  NA
Age                      6.294261e-03 NA  NA
Gender                  -1.319354e-03 NA  NA
Religion                -1.944322e-03 NA  NA
Employment               3.060151e-03 NA  NA
Income_median            4.626348e-05 NA  NA
EnglishSpeak            -2.650589e-03 NA  NA
EnglishDiff              6.401704e-03 NA  NA
See.Family               1.796675e-03 NA  NA
Close.Family             1.474619e-03 NA  NA
Helpful.Family           2.581160e-03 NA  NA
See.Friends             -1.812413e-03 NA  NA
Close.Friends            2.267354e-03 NA  NA
Helpful.Friends          7.554440e-04 NA  NA
Family.Respect          -9.647341e-04 NA  NA
Similar.Values          -2.101901e-05 NA  NA
Successful.Family        1.055152e-03 NA  NA
Trust                   -8.026600e-04 NA  NA
Loyalty                 -3.004785e-04 NA  NA
Family.Pride            -1.110644e-03 NA  NA
Expression              -1.192176e-03 NA  NA
Spend.Time.Together      1.265061e-04 NA  NA
Feel.Close              -5.187257e-04 NA  NA
Togetherness             2.747022e-04 NA  NA
Religious.Attendance    -2.154685e-03 NA  NA
Religious.Importance    -6.844490e-04 NA  NA
Close.knit.Community     1.337990e-03 NA  NA
Helpful.Community        5.608645e-03 NA  NA
Community.Shares.Values  5.225066e-04 NA  NA
Get.Along                1.170703e-03 NA  NA
Community.Trust          1.138418e-03 NA  NA
Health.Insurance         1.600614e-03 NA  NA
Dental.Insurance         6.147797e-03 NA  NA
Discrimination          -6.239883e-03 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

ggsave(filename="UC_test_VIMP.png",width=5,height=5,units="in")

Folk Medicine

ps(`Folkmedicine`)
# A tibble: 3 × 3
  Folkmedicine     n   pct
  <fct>        <int> <dbl>
1 0             2189 83.9 
2 Yes            348 13.3 
3 <NA>            72  2.76

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Folkmedicine ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1899
           Frequency of class labels: 1642, 257
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 306.211
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1200
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.3891
                   (OOB) Brier score: 0.11175568
        (OOB) Normalized Brier score: 0.44702273
                           (OOB) AUC: 0.67616364
                      (OOB) Log-loss: 0.37586696
                        (OOB) PR-AUC: 0.23627522
                        (OOB) G-mean: 0.62877744
   (OOB) Requested performance error: 0.37122256

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   970 672      0.4093
       Yes  85 172      0.3307

      (OOB) Misclassification rate: 0.3986309
plot(imb,plots.one.page = F)


                           all    0   Yes
Age                     0.0299   NA    NA
Ethnicity               0.0196   NA    NA
Helpful.Friends         0.0083   NA    NA
EnglishSpeak            0.0072   NA    NA
Togetherness            0.0066   NA    NA
Feel.Close              0.0062   NA    NA
Family.Pride            0.0061   NA    NA
Family.Respect          0.0060   NA    NA
Religion                0.0060   NA    NA
Close.knit.Community    0.0051   NA    NA
Community.Trust         0.0050   NA    NA
Trust                   0.0047   NA    NA
Close.Friends           0.0046   NA    NA
EnglishDiff             0.0046   NA    NA
See.Friends             0.0044   NA    NA
Employment              0.0042   NA    NA
Religious.Importance    0.0034   NA    NA
Dental.Insurance        0.0031   NA    NA
Loyalty                 0.0031   NA    NA
Get.Along               0.0026   NA    NA
Health.Insurance        0.0019   NA    NA
Similar.Values          0.0019   NA    NA
See.Family              0.0018   NA    NA
Expression              0.0011   NA    NA
Helpful.Community       0.0009   NA    NA
Helpful.Family         -0.0002   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1642.0000000  257.0000000    6.3891051    0.1353344    0.6692607    0.5907430 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2037915    0.9194313    0.3986309    0.1117557    0.4470227    0.6761636 
     logloss           F1        F1mod  pr.auc.rand       pr.auc      F1gmean 
   0.3758670    0.3124432    0.4356551    0.1353344    0.2362752    0.4706103 
  F1modgmean        gmean 
   0.5322163    0.6287774 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Folkmedicine~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Folkmedicine` ~ .,importance=T,data=train,
                    perf.type = "gmean",splitrule="auc")
print(rfobj)
                         Sample size: 1520
           Frequency of class labels: 747, 773
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 289.0813
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 961
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0348
                   (OOB) Brier score: 0.13602902
        (OOB) Normalized Brier score: 0.54411608
                           (OOB) AUC: 0.94618318
                      (OOB) Log-loss: 0.44503014
                        (OOB) PR-AUC: 0.94488817
                        (OOB) G-mean: 0.86839658
   (OOB) Requested performance error: 0.13160342

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   632 115      0.1539
       Yes  84 689      0.1087

      (OOB) Misclassification rate: 0.1309211
print(rfobj)
                         Sample size: 1520
           Frequency of class labels: 747, 773
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 289.0813
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 961
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0348
                   (OOB) Brier score: 0.13602902
        (OOB) Normalized Brier score: 0.54411608
                           (OOB) AUC: 0.94618318
                      (OOB) Log-loss: 0.44503014
                        (OOB) PR-AUC: 0.94488817
                        (OOB) G-mean: 0.86839658
   (OOB) Requested performance error: 0.13160342

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   632 115      0.1539
       Yes  84 689      0.1087

      (OOB) Misclassification rate: 0.1309211
plot(rfobj,plots.one.page = FALSE)


                             all    0   Yes
Ethnicity                 0.0295   NA    NA
EnglishSpeak              0.0153   NA    NA
Religion                  0.0153   NA    NA
Religious.Importance      0.0135   NA    NA
Discrimination            0.0111   NA    NA
Community.Shares.Values   0.0103   NA    NA
EnglishDiff               0.0103   NA    NA
Close.knit.Community      0.0091   NA    NA
Religious.Attendance      0.0089   NA    NA
Age                       0.0086   NA    NA
Helpful.Community         0.0076   NA    NA
Helpful.Friends           0.0075   NA    NA
Dental.Insurance          0.0073   NA    NA
Close.Friends             0.0069   NA    NA
See.Friends               0.0064   NA    NA
Family.Pride              0.0061   NA    NA
Community.Trust           0.0058   NA    NA
Gender                    0.0053   NA    NA
Close.Family              0.0052   NA    NA
Get.Along                 0.0050   NA    NA
Spend.Time.Together       0.0046   NA    NA
Helpful.Family            0.0044   NA    NA
Loyalty                   0.0041   NA    NA
Trust                     0.0041   NA    NA
Feel.Close                0.0040   NA    NA
Family.Respect            0.0034   NA    NA
rfobj$importance
                                  all  0 Yes
Ethnicity                2.948927e-02 NA  NA
Age                      8.644268e-03 NA  NA
Gender                   5.327054e-03 NA  NA
Religion                 1.525852e-02 NA  NA
Employment               2.472414e-03 NA  NA
Income_median            5.755348e-04 NA  NA
EnglishSpeak             1.525852e-02 NA  NA
EnglishDiff              1.032383e-02 NA  NA
See.Family              -5.583749e-05 NA  NA
Close.Family             5.212687e-03 NA  NA
Helpful.Family           4.373457e-03 NA  NA
See.Friends              6.423337e-03 NA  NA
Close.Friends            6.897620e-03 NA  NA
Helpful.Friends          7.523027e-03 NA  NA
Family.Respect           3.380869e-03 NA  NA
Similar.Values           2.634425e-03 NA  NA
Successful.Family        2.472414e-03 NA  NA
Trust                    4.069851e-03 NA  NA
Loyalty                  4.069851e-03 NA  NA
Family.Pride             6.140097e-03 NA  NA
Expression               2.634425e-03 NA  NA
Spend.Time.Together      4.639075e-03 NA  NA
Feel.Close               4.009743e-03 NA  NA
Togetherness            -7.415481e-04 NA  NA
Religious.Attendance     8.867885e-03 NA  NA
Religious.Importance     1.349836e-02 NA  NA
Close.knit.Community     9.057301e-03 NA  NA
Helpful.Community        7.595451e-03 NA  NA
Community.Shares.Values  1.032383e-02 NA  NA
Get.Along                5.008068e-03 NA  NA
Community.Trust          5.790696e-03 NA  NA
Health.Insurance         1.176693e-04 NA  NA
Dental.Insurance         7.273614e-03 NA  NA
Discrimination           1.111232e-02 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

ggsave(filename="Folk_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
328.0000000  51.0000000   6.4313725   0.1345646   0.6078431   0.5762195 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1823529   0.9043062   0.4195251   0.1152065   0.4608261   0.6171090 
    logloss          F1       F1mod pr.auc.rand      pr.auc     F1gmean 
  0.3875304   0.2805430   0.4011915   0.1345646   0.1712250   0.4361816 
 F1modgmean       gmean 
  0.4965058   0.5918201 
test_rf$importance
                                  all  0 Yes
Ethnicity                4.808677e-03 NA  NA
Age                      2.162911e-02 NA  NA
Gender                  -1.226642e-03 NA  NA
Religion                -2.983475e-05 NA  NA
Employment               2.670185e-03 NA  NA
Income_median            5.172185e-04 NA  NA
EnglishSpeak             9.061304e-03 NA  NA
EnglishDiff              3.219174e-03 NA  NA
See.Family              -3.480972e-04 NA  NA
Close.Family            -6.965218e-04 NA  NA
Helpful.Family          -5.081808e-03 NA  NA
See.Friends             -3.446965e-03 NA  NA
Close.Friends            3.575321e-03 NA  NA
Helpful.Friends         -1.391379e-03 NA  NA
Family.Respect           3.186908e-04 NA  NA
Similar.Values           9.355895e-04 NA  NA
Successful.Family        5.843771e-04 NA  NA
Trust                   -6.103914e-04 NA  NA
Loyalty                 -9.823498e-04 NA  NA
Family.Pride            -3.188944e-03 NA  NA
Expression              -3.027179e-04 NA  NA
Spend.Time.Together     -2.377881e-03 NA  NA
Feel.Close              -3.196601e-03 NA  NA
Togetherness             1.070865e-04 NA  NA
Religious.Attendance    -3.521653e-03 NA  NA
Religious.Importance    -1.907589e-03 NA  NA
Close.knit.Community    -1.757698e-03 NA  NA
Helpful.Community       -1.920971e-03 NA  NA
Community.Shares.Values -4.778563e-03 NA  NA
Get.Along               -2.845168e-03 NA  NA
Community.Trust         -2.973031e-03 NA  NA
Health.Insurance        -8.981269e-04 NA  NA
Dental.Insurance        -1.803215e-03 NA  NA
Discrimination           4.059630e-03 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
  theme_bw()

importance_plot

ggsave(filename="Folk_test_VIMP.png",width=5,height=5,units="in")